In [1]:
from preamble import *
% matplotlib notebook

In [2]:
from glob import glob

# Read every CSV under data/citibike/ into its own DataFrame.
# glob() returns paths in arbitrary (filesystem-dependent) order, so the paths
# are sorted to keep the row order of the combined table reproducible.
dfs = [pd.read_csv(path) for path in sorted(glob("data/citibike/*.csv"))]

In [3]:
# Stack the per-file frames into one table. Each frame read by read_csv carries
# its own 0..n-1 index, so renumber the rows rather than keep duplicate labels.
data = pd.concat(dfs, ignore_index=True)

In [4]:
# List the column names of the combined trip table.
data.columns


Out[4]:
Index(['tripduration', 'starttime', 'stoptime', 'start station id',
       'start station name', 'start station latitude',
       'start station longitude', 'end station id', 'end station name',
       'end station latitude', 'end station longitude', 'bikeid', 'usertype',
       'birth year', 'gender'],
      dtype='object')

In [5]:
# Preview the first rows of the combined trip table.
# NOTE(review): the recorded output shows the literal string \N in 'birth year'
# for one row — presumably a missing-value marker; confirm before treating that
# column as numeric.
data.head()


Out[5]:
tripduration starttime stoptime start station id start station name start station latitude start station longitude end station id end station name end station latitude end station longitude bikeid usertype birth year gender
0 326 2013-10-01 00:01:08 2013-10-01 00:06:34 239 Willoughby St & Fleet St 40.691966 -73.981302 366 Clinton Ave & Myrtle Ave 40.693261 -73.968896 16052 Subscriber 1982 1
1 729 2013-10-01 00:01:21 2013-10-01 00:13:30 322 Clinton St & Tillary St 40.696192 -73.991218 398 Atlantic Ave & Furman St 40.691652 -73.999979 19412 Customer \N 0
2 520 2013-10-01 00:01:24 2013-10-01 00:10:04 174 E 25 St & 1 Ave 40.738177 -73.977387 403 E 2 St & 2 Ave 40.725029 -73.990697 19645 Subscriber 1984 1
3 281 2013-10-01 00:01:25 2013-10-01 00:06:06 430 York St & Jay St 40.701485 -73.986569 323 Lawrence St & Willoughby St 40.692362 -73.986317 16992 Subscriber 1985 1
4 196 2013-10-01 00:01:27 2013-10-01 00:04:43 403 E 2 St & 2 Ave 40.725029 -73.990697 401 Allen St & Rivington St 40.720196 -73.989978 15690 Subscriber 1986 1

In [6]:
# Add a constant counter column (summing it later counts trips), parse the
# start timestamps, and index the rides by start time.
data = data.assign(
    one=1,
    starttime=lambda frame: pd.to_datetime(frame["starttime"]),
).set_index("starttime")

In [7]:
# Count departures per start station in 3-hour bins: summing the constant
# "one" column within each (station, time bin) group gives the trip count.
counter_by_station = data.groupby("start station id")["one"]
data_resampled = counter_by_station.resample("3h").sum()

In [8]:
# Reshape to one column per station and replace missing entries with 0.
per_station = (
    data_resampled
    .unstack(level=0)  # move the first index level (the station id) into columns
    .fillna(0)
)

In [9]:
# Draw the series for station 301 on a fresh figure.
fig, ax = plt.subplots()
per_station[301].plot(ax=ax)


Out[9]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f3fec194128>

In [10]:
from sklearn.gaussian_process import GaussianProcessRegressor

In [11]:
# Target: the values of the station-301 column.
y = per_station[301].values
# Feature: each observation's position in the series, as a single-column 2-D array.
X = np.arange(y.shape[0])[:, np.newaxis]

In [12]:
# Baseline: a Gaussian process regressor with all-default settings, fit on the full series.
default_gp = GaussianProcessRegressor()
gp = default_gp.fit(X, y)

In [13]:
# Overlay the observed series and the model's predictions for the same inputs.
fig, ax = plt.subplots()
ax.plot(y, label="y")
ax.plot(gp.predict(X), label="preds")
ax.legend()


Out[13]:
<matplotlib.legend.Legend at 0x7f3fec060400>

In [14]:
# Display the kernel_ attribute of the fitted model.
gp.kernel_


Out[14]:
1**2 * RBF(length_scale=1)

In [18]:
from sklearn.gaussian_process.kernels import RBF, ExpSineSquared, WhiteKernel
# With 3-hour bins, 8 consecutive bins span one day and 56 span one week.
BINS_PER_DAY = 8
BINS_PER_WEEK = 56

# Two RBF components with different length-scale search ranges.
short_rbf = 1.0 * RBF(length_scale_bounds=(2, 500))
long_rbf = 1.0 * RBF(length_scale_bounds=(50, 1000))

# Fixed-period components, each the product of an RBF and a periodic kernel.
daily_term = (1.0 * RBF(length_scale=100, length_scale_bounds=(2, 500))
              * ExpSineSquared(periodicity=BINS_PER_DAY, periodicity_bounds="fixed"))
weekly_term = (1.0 * RBF(length_scale=100, length_scale_bounds=(2, 500))
               * ExpSineSquared(periodicity=BINS_PER_WEEK, periodicity_bounds="fixed"))

# An explicit `1.0 * WhiteKernel(noise_level=1)` term is deliberately left out.
composite_kernel = short_rbf + long_rbf + daily_term + weekly_term

# Fit on the first 1500 observations only; later ones are not used for fitting.
gp = GaussianProcessRegressor(
    alpha=1,
    normalize_y=True,
    kernel=composite_kernel,
).fit(X[:1500], y[:1500])

In [20]:
# Overlay the observed series and the model's predictions over the whole input range.
fig, ax = plt.subplots()
ax.plot(y, label="y")
ax.plot(gp.predict(X), label="preds")
ax.legend()


Out[20]:
<matplotlib.legend.Legend at 0x7f3feb6c5d68>

In [21]:
# Display the kernel_ attribute of the fitted model.
# In the recorded output one RBF length scale is 500, which equals the upper
# end of a (2, 500) bound passed when the kernel was built.
gp.kernel_


Out[21]:
1.18**2 * RBF(length_scale=8.34) + 6.15**2 * RBF(length_scale=627) + 5.58**2 * RBF(length_scale=500) * ExpSineSquared(length_scale=0.000298, periodicity=8) + 6.96**2 * RBF(length_scale=37.7) * ExpSineSquared(length_scale=0.0621, periodicity=56)

Exercise

Pick a subset of stations from a particular area of the city. Can you use location information to improve the estimates? Can you make predictions for one station given data from other stations?